The term Boosting refers to a family of algorithms that convert weak learners into strong learners.
There are many boosting algorithms:
sklearn.ensemble.GradientBoostingRegressor
xgboost.XGBRegressor # fast and best
lightgbm.LGBMRegressor # extremely fast, slightly lower accuracy than xgb
catboost.CatBoostRegressor # good for categorical feats
import time
notebook_start_time = time.time()
import sys
ENV_BHISHAN = None
try:
import bhishan
print('Environment: Personal environment')
ENV_BHISHAN = True
%load_ext autoreload
%autoreload 2
except:
print('Module "bhishan" not found.')
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
#!pip install hpsklearn
!pip install shap eli5
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
# set OMP_NUM_THREADS=1 for hpsklearn package
#!export OMP_NUM_THREADS=1
print('Environment: Google Colab')
# Core numerical / dataframe stack.
import numpy as np
import pandas as pd
# Global random seed used for every split and model in this notebook.
SEED = 100
# Plotting defaults.
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
# boosting libraries (sklearn-style estimators plus native data containers)
import xgboost as xgb
import lightgbm as lgb
import catboost
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier, DMatrix
from lightgbm import LGBMClassifier, Dataset
from catboost import CatBoostClassifier, Pool, CatBoost
# Print installed versions for reproducibility.
print([(x.__name__,x.__version__) for x in [xgb, lgb,catboost]])
# six and pickle
import six
import pickle
import joblib
# scale and split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# sklearn scalar metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# roc auc and curves
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
# confusion matrix and classification report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import time
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
import copy
import pprint
# Pretty-printer used for dumping parameter dictionaries.
pp = pprint.PrettyPrinter(indent=4)
# model interpretation modules
import eli5
import shap
# Typical shap usage, kept here for reference:
# shap_values = shap.TreeExplainer(model_xgb).shap_values(Xtest)
# shap.summary_plot(shap_values, Xtest)
# shap.dependence_plot("column_name", shap_values, Xtest)
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Return the public attributes of *obj* arranged in a DataFrame grid.

    Parameters
    ----------
    obj : object
        Any object (class, module, instance) whose ``dir()`` is inspected.
    ncols : int
        Number of columns in the returned grid.
    start : str, tuple, or list, optional
        Keep only attributes starting with this prefix (or any of them).
    inside : str, tuple, or list, optional
        Keep only attributes containing this substring (or any of them).

    Example
    -------
    show_method_attributes(list)
    """
    lst = [elem for elem in dir(obj) if not elem.startswith('_')]
    # Drop common module aliases that sometimes leak into dir() output.
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    if isinstance(start, (tuple, list)):
        # BUGFIX: the old nested comprehension emitted a duplicate entry for
        # every prefix an attribute matched; str.startswith accepts a tuple
        # of prefixes and keeps each attribute at most once.
        lst = [elem for elem in lst if elem.startswith(tuple(start))]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    if isinstance(inside, (tuple, list)):
        # Same dedup fix for the substring filters.
        lst = [elem for elem in lst if any(sub in elem for sub in inside)]
    # Split into ncols roughly-equal chunks, transpose to a grid, blank-fill.
    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
# Running scoreboard: one row per fitted model / configuration, filled in by
# the evaluation cells below.
_eval_columns = ['Model', 'Description', 'Accuracy',
                 'Precision', 'Recall', 'F1', 'AUC']
df_eval = pd.DataFrame({col: [] for col in _eval_columns})
# Credit-card fraud dataset, downloaded as a zipped csv from GitHub.
ifile = 'https://github.com/bhishanpdl/Project_Fraud_Detection/blob/master/data/raw/creditcard.csv.zip?raw=true'
df = pd.read_csv(ifile,compression='zip')
print(df.shape)
df.head()
# Binary target column; every other column is a feature.
target = 'Class'
features = df.columns.drop(target)
# Class balance in percent (this dataset is highly imbalanced).
df[target].value_counts(normalize=True)*100
from sklearn.model_selection import train_test_split
# 80/20 stratified train/test split; stratify keeps the fraud ratio equal
# in both parts and random_state makes the split reproducible.
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target,axis=1),
    df[target],
    test_size=0.2,
    random_state=SEED,
    stratify=df[target])
# Plain numpy label arrays for the sklearn metric functions.
ytrain_orig = ser_ytrain_orig.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()
print(df_Xtrain_orig.shape)
df_Xtrain_orig.head()
# Second stratified split: carve a validation set out of the training data.
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig,
    ser_ytrain_orig,
    test_size=0.2,
    random_state=SEED,
    stratify=ser_ytrain_orig)
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(df_Xtrain.shape)
# Random undersampling: sample the minority-class count from EACH class so
# that df_under is perfectly balanced.
# BUGFIX: use .min() rather than .values[-1]; value_counts() happens to sort
# descending so the last entry is the minority count, but .min() states the
# intent directly and is robust to ties.
n = df[target].value_counts().min()
df_under = (df.groupby(target)
            .apply(lambda x: x.sample(n, random_state=SEED))
            .reset_index(drop=True))

# Stratified train/test split of the balanced data.
df_Xtrain_orig_under, df_Xtest_under, ser_ytrain_orig_under, ser_ytest_under = train_test_split(
    df_under.drop(target, axis=1),
    df_under[target],
    test_size=0.2,
    random_state=SEED,
    stratify=df_under[target])

# Carve a validation set out of the balanced training data.
df_Xtrain_under, df_Xvalid_under, ser_ytrain_under, ser_yvalid_under = train_test_split(
    df_Xtrain_orig_under,
    ser_ytrain_orig_under,
    test_size=0.2,
    random_state=SEED,
    stratify=ser_ytrain_orig_under)

# Sanity check: class counts of the (full, imbalanced) splits.
ser_ytrain.value_counts(), ser_ytest.value_counts(), ser_yvalid.value_counts()
https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
class CatBoostRegressor(iterations=None,learning_rate=None,depth=None,
l2_leaf_reg=None,model_size_reg=None,rsm=None,loss_function='RMSE',
border_count=None,feature_border_type=None,
per_float_feature_quantization=None,input_borders=None,
output_borders=None,fold_permutation_block=None,od_pval=None,
od_wait=None,od_type=None,nan_mode=None,counter_calc_method=None,
leaf_estimation_iterations=None,leaf_estimation_method=None,
thread_count=None,random_seed=None,use_best_model=None,
best_model_min_trees=None,verbose=None,silent=None,logging_level=None,
metric_period=None,ctr_leaf_count_limit=None,store_all_simple_ctr=None,
max_ctr_complexity=None,
has_time=None,allow_const_label=None,one_hot_max_size=None,
random_strength=None,name=None,ignored_features=None,
train_dir=None,custom_metric=None,eval_metric=None,
bagging_temperature=None,save_snapshot=None,
snapshot_file=None,snapshot_interval=None,
fold_len_multiplier=None,used_ram_limit=None,gpu_ram_part=None,
pinned_memory_size=None,allow_writing_files=None,
final_ctr_computation_mode=None,approx_on_full_history=None,
boosting_type=None,simple_ctr=None,combinations_ctr=None,
per_feature_ctr=None,ctr_target_border_count=None,task_type=None,
device_config=None,devices=None,bootstrap_type=None,subsample=None,
sampling_unit=None,dev_score_calc_obj_block_size=None,
max_depth=None,n_estimators=None,num_boost_round=None,
num_trees=None,colsample_bylevel=None,random_state=None,
reg_lambda=None,objective=None,eta=None,max_bin=None,
gpu_cat_features_storage=None,data_partition=None,
metadata=None,early_stopping_rounds=None,cat_features=None,
grow_policy=None,min_data_in_leaf=None,min_child_samples=None,
max_leaves=None,num_leaves=None,score_function=None,
leaf_estimation_backtracking=None,ctr_history_unit=None,
monotone_constraints=None)
# Explore the catboost package API with the helper defined above.
import catboost
show_method_attributes(catboost)
from catboost import CatBoostClassifier, Pool
show_method_attributes(CatBoostClassifier)
# Imports for fitting and evaluation (some repeat earlier imports; kept so
# this section can run standalone).
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix
# ---------------------------------------------------------------
# Baseline: CatBoostClassifier with default parameters, numpy input
# ---------------------------------------------------------------
time_start = time.time()

# current parameters / data
desc = 'default,random_state=100, numpy'
Xtr = df_Xtrain.to_numpy()
ytr = ser_ytrain.to_numpy().ravel()
Xtx = df_Xtest.to_numpy()
ytx = ser_ytest.to_numpy().ravel()

# fit the model
model_cat = CatBoostClassifier(verbose=100, random_state=SEED)
model_cat.fit(Xtr, ytr)
model = model_cat

# save the model
# joblib.dump(model_cat, 'model_cat.pkl')
# model_cat = joblib.load('model_cat.pkl')

# NOTE(review): cross_val_predict re-fits the estimator on folds of the TEST
# set, so these scores are not true held-out-test scores; consider
# model_cat.predict(Xtx) instead — confirm intended methodology.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=SEED)
ypreds_cv = cross_val_predict(model_cat, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# model evaluation
# BUGFIX: use the `desc` variable (previously dead) as the row label, and
# average='binary' instead of 'micro' — with binary labels, micro-averaged
# precision/recall/F1 all collapse to accuracy. 'binary' also matches the
# optuna evaluation cell later in the notebook.
row_eval = ['catboost', desc,
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average='binary'),
            recall_score(ytx, ypreds, average='binary'),
            f1_score(ytx, ypreds, average='binary'),
            roc_auc_score(ytx, ypreds),
            ]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
# ROC curve of the baseline model on the held-out test set.
from sklearn import metrics

yprobs = model_cat.predict_proba(df_Xtest)
ypreds = yprobs[:,1]  # probability column for the positive (fraud) class

# All (FPR, TPR) pairs over the score thresholds, and the area under them.
fpr, tpr, threshold = metrics.roc_curve(ytest, ypreds)
roc_auc = metrics.auc(fpr, tpr)

# Draw the curve plus the chance diagonal for reference.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'red', label = 'ROC AUC score = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1],'b--')
plt.legend(loc = 'lower right')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
import eli5
# Global feature weights of the fitted catboost model.
# eli5.explain_weights_catboost(model_cat) # same thing
eli5.show_weights(model_cat)
df_Xtrain.head(2)
# ---------------------------------------------------------------
# CatBoost with a validation eval_set (dataframe input this time)
# ---------------------------------------------------------------
time_start = time.time()
# current parameters / data
Xtr = df_Xtrain
ytr = ser_ytrain.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()
Xvd = df_Xvalid
yvd = ser_yvalid.to_numpy().ravel()
# fit the model, monitoring the validation set while training
model = CatBoostClassifier(random_state=0,verbose=100)
model.fit(Xtr, ytr,
          eval_set=(Xvd, yvd))
# NOTE(review): cross_val_predict re-fits the model on folds of the TEST
# set, so this is not a true held-out evaluation — confirm intent.
skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED)
ypreds = cross_val_predict(model, Xtx, ytx, cv=skf)
# ROC AUC score (the original comment said "r-squared", which was wrong)
r = roc_auc_score(ytx, ypreds)
# time
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
print('ROC AUC Score ', r)
See the catboost tutorials on model analysis, in particular the feature-statistics tutorial.
# Per-feature statistics plot for one float feature (catboost built-in).
feature_name = 'Amount'
dict_stats = model.calc_feature_statistics(df_Xtrain, ser_ytrain, feature_name, plot=True)
# feature importance table, colored high-to-low
df_imp = pd.DataFrame({'Feature': features,
                       'Importance': model.feature_importances_
                       })
df_imp.sort_values('Importance',ascending=False).style.background_gradient()
def plot_feature_imp_catboost(model_catboost, features=None):
    """Plot catboost feature importances as a horizontal bar chart.

    Parameters
    ----------
    model_catboost : fitted catboost model
        Must expose ``feature_names_`` and ``feature_importances_``.
    features : list, optional
        Unused; kept (now with a default) for backward compatibility with
        existing callers.
    """
    # BUGFIX: the original body read the global ``model`` instead of the
    # ``model_catboost`` argument, so the argument was silently ignored.
    df_imp = pd.DataFrame({'Feature': model_catboost.feature_names_,
                           'Importance': model_catboost.feature_importances_
                           })
    df_imp = df_imp.sort_values('Importance').set_index('Feature')

    ax = df_imp.plot.barh(figsize=(12,8))
    plt.grid(True)
    plt.title('Feature Importance',fontsize=14)
    ax.get_legend().remove()

    # Annotate each bar with its importance value.
    for p in ax.patches:
        width = p.get_width()
        ax.text(width, p.get_y(), '{:.2f}'.format(width),
                fontsize=15, color='indigo')
    plt.show()
plot_feature_imp_catboost(model, features)
# Prettified importance table straight from catboost.
df_fimp = model.get_feature_importance(prettified=True)
df_fimp.head()
plt.figure(figsize=(12,8))
ax = sns.barplot(x=df_fimp.columns[1], y=df_fimp.columns[0], data=df_fimp);
# Annotate each bar with its importance value.
for p in ax.patches:
    x = p.get_width()
    y = p.get_y()
    text = '{:.2f}'.format(p.get_width())
    ax.text(x, y,text,fontsize=15,color='indigo',va='top',ha='left')
from catboost import CatBoost, Pool
# help(CatBoost)

cat_features = [] # take it empty for the moment

# catboost's native data container; eval metrics can be computed on a Pool.
dtrain = Pool(df_Xtrain, ser_ytrain, cat_features=cat_features)
dvalid = Pool(df_Xvalid, ser_yvalid, cat_features=cat_features)
dtest = Pool(df_Xtest, ser_ytest, cat_features=cat_features)

# BUGFIX: the original dict listed 'verbose' twice (False, then 200);
# Python keeps only the last value, so the effective setting was 200.
params_cat = {'iterations': 100,
              'random_seed': 0,
              'eval_metric':'AUC',
              'loss_function':'Logloss',
              'cat_features': [],
              'ignored_features': [],
              'early_stopping_rounds': 200,
              'verbose': 200,
              }

bst_cat = CatBoost(params=params_cat)
bst_cat.fit(dtrain,
            eval_set=(df_Xvalid, ser_yvalid),
            use_best_model=True,
            plot=True);

# AUC on the test pool at the final (best) iteration.
print(bst_cat.eval_metrics(dtest, ['AUC'])['AUC'][-1])
cv(pool=None, params=None, dtrain=None, iterations=None,
num_boost_round=None, fold_count=None, nfold=None, inverted=False,
partition_random_seed=0, seed=None, shuffle=True, logging_level=None,
stratified=None, as_pandas=True, metric_period=None, verbose=None,
verbose_eval=None, plot=False, early_stopping_rounds=None,
save_snapshot=None, snapshot_file=None,
snapshot_interval=None, folds=None, type='Classical')
# Cross-validated training curves via catboost.cv.
params = {'iterations': 100, 'verbose': False,
          'random_seed': 0,
          'loss_function':'Logloss',
          'eval_metric':'AUC',
          }

# BUGFIX: plot expects a boolean, not the string "True" (a non-empty string
# is truthy, so it happened to work, but the type was wrong).
df_scores = catboost.cv(dtrain,
                        params,
                        fold_count=2,
                        verbose=100,
                        shuffle=True,
                        stratified=True,
                        plot=True) # plot does not work in google colab

print(df_scores.columns)
df_scores.head()

# BUGFIX: the original drew onto a stale ``ax`` left over from an earlier
# cell; create a fresh figure/axes so both curves share one new plot.
fig, ax = plt.subplots(figsize=(12,8))
sns.lineplot(x='iterations',y='train-Logloss-mean',data=df_scores,ax=ax,color='r')
sns.lineplot(x='iterations',y='test-Logloss-mean',data=df_scores,ax=ax,color='b',alpha=0.2,linewidth=5,linestyle='--')
plt.show()
We should generally optimize model complexity first and then tune convergence.
Model complexity: max_depth, etc. Convergence: learning rate.
Parameters:
# Baseline confusion matrix with default hyperparameters.
model = CatBoostClassifier(verbose=100,random_state=SEED)
model.fit(df_Xtrain, ytr)
ypreds = model.predict(df_Xtest)
cm = confusion_matrix(ytest, ypreds)
print(cm)
# Longer run (3000 iterations) with early stopping on the validation set.
params = dict(verbose=500,
              random_state=0,
              iterations=3_000,
              eval_metric='AUC',
              cat_features = [],
              early_stopping_rounds=200,
              )
model = catboost.CatBoostClassifier(**params)
model.fit(df_Xtrain, ytrain,
          eval_set=(df_Xvalid, yvalid),
          use_best_model=True,  # keep the iteration with the best eval AUC
          plot=False
          );
# Quick experiment: only 50 boosting iterations.
time_start = time.time()

model = CatBoostClassifier(verbose=False, random_state=0, iterations=50)
model.fit(df_Xtrain, ser_ytrain)
ypreds = model.predict(df_Xtest)

# Total misclassifications = sum of the off-diagonal confusion-matrix cells.
cm = confusion_matrix(ytest, ypreds)
error = cm[0,1] + cm[1,0]

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
# BUGFIX: typo 'Errro' -> 'Error' in the printed message.
print('Error confusion matrix', error)
# using 50 iterations is worse, use previous 1000.
# Depth sweep; only the default depth=6 was kept after experimentation.
for n in [6]: # default depth = 6
    model = CatBoostClassifier(verbose=False,random_state=0,
                               iterations=1_000,
                               depth=n,
                               )
    model.fit(Xtr, ytr)
    ypreds = model.predict(Xtx)
    cm = confusion_matrix(ytest, ypreds)
    # off-diagonal cells = misclassified samples
    error = cm[0,1] + cm[1,0]
    print(f'Confusion matrix error count = {error} for n = {n}')
# Seed sweep: check sensitivity of the error count to random_state.
for n in [0]:
    model = CatBoostClassifier(verbose=False,random_state=n,
                               depth=6,
                               iterations=1_000,
                               )
    model.fit(Xtr, ytr)
    ypreds = model.predict(Xtx)
    cm = confusion_matrix(ytest, ypreds)
    # off-diagonal cells = misclassified samples
    error = cm[0,1] + cm[1,0]
    print(f'Confusion matrix error count = {error} for n = {n}')
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress

def objective(trial):
    """Optuna objective: maximize validation ROC AUC of a CatBoostClassifier."""
    params_cat_optuna = {
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 1, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type',
                                                    ['Bayesian', 'Bernoulli', 'MVS']),
        'used_ram_limit': '3gb'
    }
    # Conditional parameters: each is only valid for its bootstrap type.
    if params_cat_optuna['bootstrap_type'] == 'Bayesian':
        params_cat_optuna['bagging_temperature'] = trial.suggest_uniform('bagging_temperature', 0, 10)
    elif params_cat_optuna['bootstrap_type'] == 'Bernoulli':
        params_cat_optuna['subsample'] = trial.suggest_uniform('subsample', 0.1, 1)
    # fit the model on train, early-stopping against the validation set
    model = CatBoostClassifier(random_state=SEED,**params_cat_optuna)
    model.fit(df_Xtrain, ser_ytrain,
              eval_set=[(df_Xvalid, ser_yvalid)],
              verbose=0,
              early_stopping_rounds=100)
    # NOTE(review): predict() returns hard labels, so np.rint is likely a
    # no-op here; AUC on hard labels is coarser than on predict_proba —
    # confirm which was intended.
    ypreds = model.predict(df_Xvalid)
    ypreds = np.rint(ypreds)
    score = roc_auc_score(ser_yvalid.to_numpy().ravel(),
                          ypreds)
    return score
# NOTE: there is inherent non-determinism in optuna hyperparameter selection
# we may not get the same hyperparameters when run twice.
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large
# The sqlite storage plus load_if_exists lets the study accumulate trials
# across notebook runs.
study = optuna.create_study(direction='maximize',
                            sampler=sampler,
                            study_name='cat_optuna',
                            storage='sqlite:///cat_optuna_fraud_detection.db',
                            load_if_exists=True)
study.optimize(objective, n_trials=N_TRIALS,timeout=600)

# Resume from last time
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large
study = optuna.create_study(direction='maximize',
                            sampler=sampler,
                            study_name='cat_optuna',
                            storage='sqlite:///cat_optuna_fraud_detection.db',
                            load_if_exists=True)
study.optimize(objective, n_trials=N_TRIALS)
print(f'Number of finished trials: {len(study.trials)}')

# best trial
best_trial = study.best_trial
# best params
params_best = study.best_trial.params
params_best
# ---------------------------------------------------------------
# Refit with the best optuna parameters and evaluate
# ---------------------------------------------------------------
time_start = time.time()

model_name = 'catboost'
desc = 'grid search optuna'

Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()
Xvd = df_Xvalid
yvd = ser_yvalid.to_numpy().ravel()

# use best model
# BUGFIX: removed the duplicated assignment (was `clf_lgb = clf_lgb = ...`).
# NOTE(review): the variable is named clf_lgb but holds a CatBoost model;
# kept unchanged in case later cells reference it.
params_best = study.best_trial.params
clf_lgb = CatBoostClassifier(random_state=SEED, verbose=False)
clf_lgb.set_params(**params_best)

# fit and save the model
clf_lgb.fit(Xtr, ytr)
joblib.dump(clf_lgb,'../outputs/clf_cat_grid_search_optuna.pkl')

# load the saved model
clf_lgb = joblib.load('../outputs/clf_cat_grid_search_optuna.pkl')

# NOTE(review): cross_val_predict re-fits the estimator on folds of the TEST
# set, so the scores below are not true held-out results — confirm.
skf = StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(clf_lgb, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# model evaluation
average = 'binary'
row_eval = [model_name,desc,
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            roc_auc_score(ytx, ypreds),
            ]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
# Rank all evaluated models by Recall (the key metric for fraud detection).
df_eval.sort_values('Recall',ascending=False).style.background_gradient(subset='Recall')
cm = confusion_matrix(ytest,ypreds)
vals = cm.ravel()  # (tn, fp, fn, tp) for binary labels
cm
print('Catboost Grid Search Results')
print('-'*25)
# frauds = fn + tp; missed frauds = fn
print('Total Frauds: ', vals[2] + vals[3])
print('Incorrect Frauds: ', vals[2])
print('Incorrect Percent: ', round(vals[2]*100/(vals[2]+vals[3]),2),'%')
from bhishan.bhishan import plotly_binary_clf_evaluation
yprobs = model.predict_proba(df_Xtest)
# NOTE(review): column 0 is the probability of class 0 (not-fraud);
# evaluation plots usually expect the positive-class column [:,1] — confirm
# what plotly_binary_clf_evaluation expects.
yprobs = yprobs[:,0] # take only first column
plotly_binary_clf_evaluation('clf_lgb_optuna',model,ytx,ypreds,yprobs,df)
yprobs
# Final model: seed 100, depth 6, 1000 iterations.
model = CatBoostClassifier(verbose=False, random_state=100,
                           depth=6,
                           iterations=1_000,
                           )
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)

cm = confusion_matrix(ytest, ypreds)
error = cm[0,1] + cm[1,0]
# BUGFIX: dropped the stale "for n = {n}" suffix — n was a leftover loop
# variable from an earlier cell and did not describe this model.
print(f'Confusion matrix error count = {error}')
print(cm)
# Peek at a couple of train and test rows together.
df_Xtrain.head(2).append(df_Xtest.head(2))
import eli5
# Model-internal feature weights.
eli5.show_weights(model)
# Permutation importance: score drop when each test column is shuffled.
from eli5.sklearn import PermutationImportance
feature_names = df_Xtrain.columns.tolist()
perm = PermutationImportance(model).fit(df_Xtest, ytx)
eli5.show_weights(perm, feature_names=feature_names)
import lime
import lime.lime_tabular
# Pick a single test example to explain with LIME.
idx = 0
example = df_Xtest.iloc[idx]
answer = ser_ytest.iloc[idx]
feature_names = df_Xtest.columns.tolist()
# reshape(-1,1).T turns the 1-D Series values into a single-row 2-D array.
prediction = model.predict(example.to_numpy().reshape(-1,1).T)
print(f'answer = {answer}')
print('prediction = ', prediction[0])
print()
print(example)
print(feature_names)
import lime
import lime.lime_tabular
# No categorical features in this dataset yet.
categorical_features = []
categorical_features_idx = [df_Xtrain.columns.get_loc(col) for col in categorical_features]
explainer = lime.lime_tabular.LimeTabularExplainer(df_Xtrain.to_numpy(),
                                                   feature_names=feature_names,
                                                   class_names=['Not-fraud','Fraud'],
                                                   categorical_features=categorical_features_idx,
                                                   mode='classification')
# NOTE(review): `example` is a pandas Series here; LIME typically expects a
# 1-D numpy array — confirm it is accepted as-is.
exp = explainer.explain_instance(example, model.predict_proba, num_features=8)
exp.show_in_notebook(show_table=True)
exp.as_pyplot_figure(); # use semicolon
import shap
shap.initjs()
# Refit a model and compute SHAP values for the test set.
model = CatBoostClassifier(verbose=100,random_state=100)
model.fit(df_Xtrain, ytrain)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
df_Xtest.head(1)
df_Xtest.head(1)['V15 V18 V3 V24 V1 V8 V4 V14 V2 V6 V9 V20'.split()].round(4)
# Look only first row of test data
# use matplotlib=True to avoid Javascript
idx = 0
shap.force_plot(explainer.expected_value,
                shap_values[idx,:],
                df_Xtest.iloc[idx,:],
                matplotlib=False,
                text_rotation=90)
# for this row, the predicted label is -9.33
# red features makes it higher
# blue features makes it smaller.
# Stacked force plot for the first NUM test rows.
NUM = 100
shap.force_plot(explainer.expected_value, shap_values[:NUM,:],
                df_Xtest.iloc[:NUM,:],matplotlib=False)
# Global summaries and dependence plots.
shap.summary_plot(shap_values, df_Xtest)
shap.summary_plot(shap_values, df_Xtest, plot_type='bar')
shap.dependence_plot("Amount", shap_values, df_Xtest)
shap.dependence_plot(ind='Time', interaction_index='Amount',
                     shap_values=shap_values,
                     features=df_Xtest,
                     display_features=df_Xtest)
# Report total notebook wall time as hours / minutes / seconds.
notebook_end_time = time.time()
time_taken = time.time() - notebook_start_time
h, m = divmod(time_taken, 60*60)  # h = whole hours, m = leftover seconds
# BUGFIX: typo 'noteook' -> 'notebook' in the printed message.
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))